In [1]:
import pandas as pd
df = {}
In [2]:
from clustergrammer2 import net
>> clustergrammer2 backend version 0.4.0
In [3]:
import ipywidgets as widgets
import numpy as np
from bqplot import pyplot as plt
import bqplot
In [4]:
from copy import deepcopy
In [5]:
from glob import glob

Load X-shift cluster ID

In [6]:
df['cell_type'] = pd.read_excel('../data/big_data/mmc2__codex_cell_type_info.xlsx', index_col=0)
df['cell_type'].shape
Out[6]:
(58, 1)
In [7]:
ct_dict = {}
cell_types = []
for inst_id in df['cell_type'].index.tolist():
    inst_ct = df['cell_type']['Imaging phenotype (cell type)'].loc[inst_id]
    cell_types.append(inst_ct)
    ct_dict[inst_id] = inst_ct
In [8]:
cell_types = sorted(list(set(cell_types)))
print(len(cell_types))
cell_types
28
Out[8]:
['B cells',
 'B220(+) DN T cells',
 'CD106(+)CD16/32(+)CD31(+) stroma',
 'CD106(+)CD16/32(+)CD31(-)Ly6C(-) stroma',
 'CD106(+)CD16/32(-)Ly6C(+)CD31(+)',
 'CD106(-)CD16/32(+)Ly6C(+)CD31(-)',
 'CD106(-)CD16/32(-)Ly6C(+)CD31(+) stroma',
 'CD11c(+) B cells',
 'CD3(+) other markers (-)',
 'CD31(hi) vascular',
 'CD4(+) T cells',
 'CD4(+)CD8(-)cDC',
 'CD4(+)MHCII(+)',
 'CD4(-)CD8(+)cDC',
 'CD4(-)CD8(-) cDC',
 'CD8(+) T cells',
 'ERTR7(+) stroma',
 'F4/80(+) mphs',
 'FDCs',
 'NK cells',
 'capsule',
 'dirt',
 'erythroblasts',
 'granulocytes',
 'marginal zone mphs',
 'megakaryocytes',
 'noid',
 'plasma cells']
In [9]:
x_dim = 1200
y_dim = 1000

Check available Files

In [10]:
glob('../data/big_data/*.txt')
Out[10]:
['../data/big_data/channelNames_BALBcMRLdataset.txt']
In [11]:
glob('../data/big_data/*.csv')
Out[11]:
['../data/big_data/CODEX_MRLdataset_neighborhood_graph.csv',
 '../data/big_data/BALBcMRLdataset_exposuretimes.csv',
 '../data/big_data/Suppl.Table2.CODEX_paper_MRLdatasetexpression.csv']

Expression Data

See http://welikesharingdata.blob.core.windows.net/forshare/index.html

  • CSV file contains mean marker intensities and X, Y, Z corrdinates for each cell relative to the top left corner of its tile.
  • Tile IDs correspond to tile images sorted in the alphanumeric order
  • Size parameter corresponds to the cell object sizes in voxels
  • "imaging phenotype Cluster ID" column specifies the phenotypic cluster identity as determined by X-shift
  • "niche Cluster ID" column specifies the i-niche cluster determined by K-means (K=100)
  • neighborhood graph files describes which pairs of cells are adjacent to one another
In [12]:
%%time
df['exp'] = pd.read_csv('../data/big_data/Suppl.Table2.CODEX_paper_MRLdatasetexpression.csv')
new_rows = ['C-' + str(x) for x in df['exp'].index.tolist()]
df['exp'].index = new_rows
print(df['exp'].shape)
(734101, 38)
CPU times: user 9.83 s, sys: 616 ms, total: 10.4 s
Wall time: 9.84 s
In [13]:
df['exp'].head()
Out[13]:
Imaging phenotype cluster ID CD45 Ly6C TCR Ly6G CD19 CD169 CD106 CD3 CD1632 ... CD44 NKp46 X.X Y.Y Z.Z MHCII blank_Cy3_cyc15 blank_Cy5_cyc15 sample_Xtile_Ytile niche cluster ID
C-0 9600 1577.675415 -154.301758 130.692184 -4.168493 560.691345 -504.231476 854.670105 -631.294189 385.935242 ... 422.408691 515.130066 10 70 13 6712.812988 1665.967896 398.348389 BALBc-3_X05_Y03 32.0
C-1 9600 1017.838440 -93.069397 144.076584 40.010998 885.595520 -391.357544 62.764454 -474.201172 -469.634583 ... 448.701660 171.880310 1000 294 12 2024.678711 1287.959229 421.991425 BALBc-3_X01_Y02 99.0
C-2 9600 5978.459961 -330.099365 139.631744 -82.840302 1747.897583 -395.508820 954.326782 -1026.204468 3744.718262 ... 2229.804443 512.220764 1003 107 8 8647.193359 2817.173828 709.545105 BALBc-2_X05_Y04 74.0
C-3 9600 6119.109375 -54.384808 -768.871704 25.625927 1065.311890 -485.535431 538.404175 -611.836426 865.842590 ... 665.720459 351.108246 1003 113 13 4838.463379 1646.660278 408.523590 BALBc-2_X03_Y04 98.0
C-4 9600 6272.474609 -235.512405 74.058075 -101.729919 1186.295044 -782.744995 1261.625366 -915.565552 1361.536011 ... 2065.742676 259.003235 1003 148 9 5092.891602 2161.109131 713.416199 BALBc-2_X02_Y01 71.0

5 rows × 38 columns

In [14]:
df['exp'].columns.tolist()
Out[14]:
['Imaging phenotype cluster ID',
 'CD45',
 'Ly6C',
 'TCR',
 'Ly6G',
 'CD19',
 'CD169',
 'CD106',
 'CD3',
 'CD1632',
 'CD8a',
 'CD90',
 'F480',
 'CD11c',
 'Ter119',
 'CD11b',
 'IgD',
 'CD27',
 'CD5',
 'CD79b',
 'CD71',
 'CD31',
 'CD4',
 'IgM',
 'B220',
 'ERTR7',
 'CD35',
 'CD2135',
 'CD44',
 'NKp46',
 'X.X',
 'Y.Y',
 'Z.Z',
 'MHCII',
 'blank_Cy3_cyc15',
 'blank_Cy5_cyc15',
 'sample_Xtile_Ytile',
 'niche cluster ID']

Add Sample Column

In [15]:
sample_list = [x.split('_')[0] for x in list(df['exp']['sample_Xtile_Ytile'].get_values())]
ser_sample = pd.Series(sample_list, name='sample_slide', index=df['exp'].index.tolist())
In [16]:
print(len(sorted(list(set(sample_list)))))
list_slides = sorted(list(set(sample_list)))
list_slides
9
Out[16]:
['BALBc-1',
 'BALBc-2',
 'BALBc-3',
 'MRL-4',
 'MRL-5',
 'MRL-6',
 'MRL-7',
 'MRL-8',
 'MRL-9']
In [17]:
df['exp']['sample_slide'] = ser_sample
In [18]:
ser_sample.value_counts()
Out[18]:
BALBc-1    85572
BALBc-2    85298
BALBc-3    83787
MRL-7      83335
MRL-8      82626
MRL-9      81156
MRL-6      81052
MRL-4      77848
MRL-5      73427
Name: sample_slide, dtype: int64
In [19]:
cols = df['exp'].columns.tolist()
cols
Out[19]:
['Imaging phenotype cluster ID',
 'CD45',
 'Ly6C',
 'TCR',
 'Ly6G',
 'CD19',
 'CD169',
 'CD106',
 'CD3',
 'CD1632',
 'CD8a',
 'CD90',
 'F480',
 'CD11c',
 'Ter119',
 'CD11b',
 'IgD',
 'CD27',
 'CD5',
 'CD79b',
 'CD71',
 'CD31',
 'CD4',
 'IgM',
 'B220',
 'ERTR7',
 'CD35',
 'CD2135',
 'CD44',
 'NKp46',
 'X.X',
 'Y.Y',
 'Z.Z',
 'MHCII',
 'blank_Cy3_cyc15',
 'blank_Cy5_cyc15',
 'sample_Xtile_Ytile',
 'niche cluster ID',
 'sample_slide']
In [20]:
exp_cols = cols[1:30]
exp_cols
Out[20]:
['CD45',
 'Ly6C',
 'TCR',
 'Ly6G',
 'CD19',
 'CD169',
 'CD106',
 'CD3',
 'CD1632',
 'CD8a',
 'CD90',
 'F480',
 'CD11c',
 'Ter119',
 'CD11b',
 'IgD',
 'CD27',
 'CD5',
 'CD79b',
 'CD71',
 'CD31',
 'CD4',
 'IgM',
 'B220',
 'ERTR7',
 'CD35',
 'CD2135',
 'CD44',
 'NKp46']
In [21]:
unique_dict = {}
for inst_col in cols:
    inst_list_unique = list(df['exp'][inst_col].unique())
    unique_dict[inst_col] = inst_list_unique    
    inst_num_unique = len(inst_list_unique)
    print(inst_col, inst_num_unique)
Imaging phenotype cluster ID 58
CD45 726877
Ly6C 731257
TCR 731035
Ly6G 729442
CD19 726981
CD169 730498
CD106 727870
CD3 730526
CD1632 728506
CD8a 728773
CD90 728172
F480 731091
CD11c 725943
Ter119 728879
CD11b 725897
IgD 730293
CD27 727817
CD5 730295
CD79b 727003
CD71 729364
CD31 725039
CD4 730112
IgM 727925
B220 729995
ERTR7 729781
CD35 728492
CD2135 729290
CD44 727003
NKp46 723719
X.X 1342
Y.Y 1006
Z.Z 15
MHCII 729269
blank_Cy3_cyc15 721818
blank_Cy5_cyc15 720065
sample_Xtile_Ytile 565
niche cluster ID 101
sample_slide 9

Select Single Image Tile

BALBc: normal tissue MRL/lpr: spleen from animals with systemic autoimmune disease

Start with: 'BALBc-1_X01_Y01'

In [22]:
df_list = []

for inst_tile in ['BALBc-1_X01_Y01', 'BALBc-1_X02_Y01', 'BALBc-1_X01_Y02', 'BALBc-1_X02_Y02']:
# for inst_tile in ['BALBc-1_X01_Y01', 'BALBc-1_X02_Y01']:
# for inst_tile in ['BALBc-1_X01_Y02', 'BALBc-1_X02_Y02']:    
    keep_rows = []

    ser_tile = df['exp']['sample_Xtile_Ytile']
    ser_found = ser_tile[ser_tile == inst_tile]
    
    keep_rows.extend(ser_found.index.tolist())
    inst_df = df['exp'].loc[keep_rows].transpose()
    print('inst_df', inst_df.shape)
    
    if 'X02' in inst_tile:
        inst_df.loc['X.X'] = inst_df.loc['X.X'] + 1350

    if 'Y01' in inst_tile:        
        inst_df.loc['Y.Y'] = 2000 - inst_df.loc['Y.Y']
    
    if 'Y02' in inst_tile:
        inst_df.loc['Y.Y'] = 1000 - inst_df.loc['Y.Y'] 
        
    df_list.append(inst_df)
    
    
# df['tile'] = df['exp'].loc[keep_rows].transpose()
df['tile'] = pd.concat(df_list, axis=1)
print(df['tile'].shape)

cats = df['tile'].loc['Imaging phenotype cluster ID']
cats = [ct_dict[x] for x in cats]

new_cols = []

cols = df['tile'].columns.tolist()
for index in range(len(cols)):
    
    new_col = (cols[index], 'Cell Type: ' + str(cats[index]))
    new_cols.append(new_col)
df['tile'].columns = new_cols
df['tile'].shape
inst_df (39, 1127)
inst_df (39, 1247)
inst_df (39, 1481)
inst_df (39, 1291)
(39, 5146)
Out[22]:
(39, 5146)

Plot Tile Expression Levels

In [23]:
df['tile-exp-ini'] = df['tile'].loc[exp_cols]
df['tile-exp-ini'].shape
Out[23]:
(29, 5146)

Set Negative Expression Levels to Zero

In [24]:
df['tile-exp'] = df['tile-exp-ini']
df['tile-exp'][df['tile-exp'] < 0] = 0
In [25]:
df['tile-exp'].transpose().describe()
Out[25]:
CD45 Ly6C TCR Ly6G CD19 CD169 CD106 CD3 CD1632 CD8a ... CD71 CD31 CD4 IgM B220 ERTR7 CD35 CD2135 CD44 NKp46
count 5146.0 5146.0 5146.0 5146.0 5146.0 5146.0 5146.0 5146.0 5146.0 5146.0 ... 5146.0 5146.0 5146.0 5146.0 5146.0 5146.0 5146.0 5146.0 5146.0 5146.0
unique 4726.0 2276.0 3872.0 3101.0 4709.0 2543.0 4739.0 1769.0 4661.0 3991.0 ... 3788.0 4900.0 3512.0 4588.0 4188.0 4417.0 4711.0 4118.0 4356.0 4926.0
top 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
freq 421.0 2871.0 1274.0 2046.0 437.0 2604.0 408.0 3378.0 486.0 1156.0 ... 1358.0 244.0 1635.0 559.0 959.0 729.0 436.0 1029.0 791.0 221.0

4 rows × 29 columns

In [26]:
df['tile-exp'][df['tile-exp'] > 5000] = 5000
In [27]:
df['tile-exp'].shape
Out[27]:
(29, 5146)
In [28]:
df['tile'].loc['Z.Z'].head()
Out[28]:
(C-643, Cell Type: B cells)     9
(C-1264, Cell Type: B cells)    9
(C-1589, Cell Type: B cells)    9
(C-1611, Cell Type: B cells)    8
(C-1873, Cell Type: B cells)    6
Name: Z.Z, dtype: object

Location of Cells

In [29]:
df['tile-loc'] = df['tile'].loc[['X.X', 'Y.Y']].transpose()
df['tile-loc'].shape
Out[29]:
(5146, 2)
In [30]:
# df['tile-loc']['Y.Y'] = 1000 - df['tile-loc']['Y.Y'] 
In [31]:
df['tile-loc'] = df['tile-loc'].astype('int')
In [32]:
def set_expression_opacity(inst_gene):
    ser_opacity = df['tile-exp'].loc[inst_gene]
    list_opacity = [float(x/ser_opacity.max()) for x in list(ser_opacity.get_values())]
    scatter.default_opacities = list_opacity
In [33]:
fig = plt.figure(title='Scatter')
def_tt = bqplot.Tooltip(fields=['name'], formats=[''])

scatter = plt.scatter(df['tile-loc']['X.X'], 
                      df['tile-loc']['Y.Y'], 
                      figsize=(20,10), 
                      ylim=(0,1000), 
                      xlim=(0,1000), stroke='black', 
                      tooltip=def_tt, 
                      names=df['tile-loc'].index.tolist(), 
                      display_names=False)

inst_width = 900
fig.layout.min_height = str(inst_width/1.2) + 'px'
fig.layout.min_width = str(inst_width) + 'px'

set_expression_opacity('NKp46')

scatter.default_size = 100
scatter.colors = ['red']

Using RGB to HEX

https://www.rgbtohex.net/

In [34]:
cat_colors = {}
# A 
cat_colors['NK cells'] = '#FB0006'
# B
cat_colors['granulocytes'] = '#FA1400'
# C 
cat_colors['CD4(-)CD8(-) cDC'] = '#FC4B08'
# D 
cat_colors['B220(+) DN T cells'] = '#FD8007'
# E 
cat_colors['plasma cells'] = '#FDBA0A'
# F 
cat_colors['F4/80(+) mphs'] = '#FFF80B'
# G 
cat_colors['FDCs'] = '#FC9CA0'
# H
cat_colors['CD11c(+) B cells'] = '#99FF06'
# I 
cat_colors['capsule'] = '#68FF0A'
# J 
cat_colors['marginal zone mphs'] = '#0A4600'
# K 
cat_colors['noid'] = '#25FF04'
# L 
cat_colors['B cells'] = '#FFFF09'
# M
cat_colors['erythroblasts'] = '#1FFF3C'
# N
cat_colors['CD106(+)CD16/32(+)CD31(+) stroma'] = '#23FF6D'
# O
cat_colors['CD4(-)CD8(+)cDC'] = '#23FFA3'
# P 
cat_colors['CD106(-)CD16/32(-)Ly6C(+)CD31(+) stroma'] = '#20FFDD'
# Q
cat_colors['megakaryocytes'] = '#1CE5FF'
# R 
cat_colors['CD106(-)CD16/32(+)Ly6C(+)CD31(-)'] = '#15A7FF'
# S
cat_colors['CD4(+) T cells'] = '#0D6FFF'
# T
cat_colors['CD4(+)MHCII(+)'] = '#0137FF'
# Ud
cat_colors['CD31(hi) vascular'] = '#0000FF'
# V
cat_colors['CD3(+) other markers (-)'] = '#0700FF'
# W
cat_colors['CD106(+)CD16/32(+)CD31(-)Ly6C(-) stroma'] = '#2D00FF'
# X
cat_colors['CD8(+) T cells'] = '#5900FF'
# Y
cat_colors['ERTR7(+) stroma'] = '#8C00FE'
# Z
cat_colors['CD106(+)CD16/32(-)Ly6C(+)CD31(+)'] = '#C300FF'
# [
cat_colors['CD4(+)CD8(-)cDC'] = '#FB00FA'
# dirt
cat_colors['dirt'] = '#BCBCBC'

Expression Levels

In [35]:
keep_cols = [x for x in df['tile-exp'].columns.tolist() if 'dirt' not in x[1]]
df['tile-exp-clean'] = df['tile-exp'][keep_cols]
df['tile-exp-clean'].shape
Out[35]:
(29, 5018)
In [36]:
net.load_df(df['tile-exp-clean'])
net.set_cat_colors(axis='col', cat_colors=cat_colors, cat_index=1, cat_title='Cell Type')
In [37]:
net.load_df(df['tile-exp-clean'])
net.widget()

Voronoi Plot

In [38]:
from scipy.spatial import Voronoi
vor = Voronoi(df['tile-loc'])
In [39]:
point_list = df['tile-loc'].index.tolist()
point_names = [x[0] for x in point_list]
cat_names   = [x[1].split(': ')[1] for x in point_list]
In [40]:
patch_data = {}
patch_data['x'] = []
patch_data['y'] = []
patch_data['colors'] = []
region_labels = []

region_point_dict = {}
for point_index in range(vor.point_region.shape[0]):
    region_index = vor.point_region[point_index]
    region_point_dict[region_index] = point_index

for region_index in range(len(vor.regions)):
    
    inst_region = vor.regions[region_index]

    if -1 not in inst_region and len(inst_region) > 0:

        point_index = region_point_dict[region_index]
        point_cat = cat_names[point_index]
        region_labels.append(point_cat)
        
        # save cat_colors
        inst_color = cat_colors[point_cat]
        patch_data['colors'].append(inst_color)
        
        x_list = []
        y_list = []
        for inst_vertex in inst_region:
            inst_pos = vor.vertices[inst_vertex]
            x_list.append(inst_pos[0])
            y_list.append(inst_pos[1])
            
        patch_data['x'].append(x_list)
        patch_data['y'].append(y_list)    
In [41]:
import bqplot.pyplot as plt
fig = plt.figure(animation_duration=1000)
patch = plt.plot([], [], 
                 fill='inside',
                 fill_colors=patch_data['colors'],
                 stroke_width=1,
                 close_path=True,
                 labels=region_labels,
                 tooltip=def_tt,
                 axes_options={'x': {'visible': False}, 'y': {'visible': False}},
                )

scatter = plt.scatter(df['tile-loc']['X.X'], 
                      df['tile-loc']['Y.Y'],
#                       tooltip=def_tt, 
                      names=point_names,
                      display_names=False, default_size=2)


inst_width = 950
fig.layout.min_height = str(inst_width/(1.15)) + 'px'
fig.layout.min_width  = str(inst_width) + 'px'

patch.x = patch_data['x']
patch.y = patch_data['y']

plt.xlim(0, 2.0*x_dim)
plt.ylim(0, 2.0*y_dim)

fig
# plt.show()
In [42]:
def mouseover_highlight(self, target):
    # print('cat name', target['data']['name'])
    list_opacities = []
    for inst_label in region_labels:
        inst_opacity = 0.25
        if inst_label == target['data']['name']:
            inst_opacity = 1
        list_opacities.append(inst_opacity)

    self.opacities = list_opacities
In [43]:
def reset_highlight(self, target):
#     print('CLICKING')
    list_opacities = [1 for x in region_labels]
    self.opacities = list_opacities
In [44]:
patch.on_hover(mouseover_highlight)
patch.on_element_click(reset_highlight)
In [45]:
cells = df['tile-loc'].index.tolist()
len(cells)
Out[45]:
5146
In [46]:
df['tile-loc'].transpose()
Out[46]:
(C-643, Cell Type: B cells) (C-1264, Cell Type: B cells) (C-1589, Cell Type: B cells) (C-1611, Cell Type: B cells) (C-1873, Cell Type: B cells) (C-1877, Cell Type: B cells) (C-1935, Cell Type: B cells) (C-2478, Cell Type: B cells) (C-2866, Cell Type: B cells) (C-2988, Cell Type: B cells) ... (C-250718, Cell Type: NK cells) (C-251502, Cell Type: noid) (C-252813, Cell Type: noid) (C-252972, Cell Type: noid) (C-253114, Cell Type: noid) (C-253125, Cell Type: noid) (C-253249, Cell Type: noid) (C-254494, Cell Type: CD4(+)MHCII(+)) (C-254542, Cell Type: CD4(+)MHCII(+)) (C-254598, Cell Type: CD4(+)MHCII(+))
X.X 10 1006 101 101 1013 1013 1013 1020 1025 1026 ... 1875 2358 1418 2137 2219 2221 1445 1749 1940 1429
Y.Y 1723 1510 1496 1241 1746 1707 1086 1975 1872 1391 ... 990 871 213 408 415 230 582 288 599 460

2 rows × 5146 columns

In [47]:
from scipy.spatial.distance import pdist, squareform
In [48]:
mat_cd = pdist(df['tile-loc'], metric='euclidean')
In [49]:
dist_mat = squareform(mat_cd)
dist_mat.shape
Out[49]:
(5146, 5146)
In [50]:
df['dist'] = pd.DataFrame(data=dist_mat, index=cells, columns=cells)
In [51]:
df['dist'].head()
Out[51]:
(C-643, Cell Type: B cells) (C-1264, Cell Type: B cells) (C-1589, Cell Type: B cells) (C-1611, Cell Type: B cells) (C-1873, Cell Type: B cells) (C-1877, Cell Type: B cells) (C-1935, Cell Type: B cells) (C-2478, Cell Type: B cells) (C-2866, Cell Type: B cells) (C-2988, Cell Type: B cells) ... (C-250718, Cell Type: NK cells) (C-251502, Cell Type: noid) (C-252813, Cell Type: noid) (C-252972, Cell Type: noid) (C-253114, Cell Type: noid) (C-253125, Cell Type: noid) (C-253249, Cell Type: noid) (C-254494, Cell Type: CD4(+)MHCII(+)) (C-254542, Cell Type: CD4(+)MHCII(+)) (C-254598, Cell Type: CD4(+)MHCII(+))
(C-643, Cell Type: B cells) 0.000000 1018.520987 244.560831 490.515035 1003.263674 1003.127609 1188.182646 1040.963016 1025.878160 1068.868561 ... 2003.874747 2497.800633 2064.597782 2500.670710 2567.205679 2667.877433 1833.331939 2254.627685 2233.444873 1899.665760
(C-1264, Cell Type: B cells) 1018.520987 0.000000 905.108281 944.132406 236.103791 197.124326 424.057779 465.210705 362.498276 120.668969 ... 1012.699857 1495.401284 1360.864799 1579.102593 1634.134021 1764.830020 1026.598753 1430.151391 1304.713378 1132.002208
(C-1589, Cell Type: B cells) 244.560831 905.108281 0.000000 255.000000 945.644754 936.090273 999.921997 1036.340678 997.573055 930.940385 ... 1844.752558 2341.938086 1838.634820 2308.471356 2377.916104 2469.241989 1625.340580 2043.322784 2046.101171 1684.304011
(C-1611, Cell Type: B cells) 490.515035 944.132406 255.000000 0.000000 1042.482134 1024.158191 925.077835 1176.144974 1118.899906 937.083241 ... 1791.668775 2287.126800 1670.710328 2199.814765 2273.367546 2348.727528 1496.869066 1903.710325 1947.841113 1540.631364
(C-1873, Cell Type: B cells) 1003.263674 236.103791 945.644754 1042.482134 0.000000 39.000000 660.000000 229.106962 126.570139 355.237948 ... 1146.551351 1604.571594 1585.595787 1747.461015 1796.106066 1938.432356 1241.579639 1633.236052 1474.767100 1351.610891

5 rows × 5146 columns

In [53]:
# df['dist'][inst_col].hist()
In [55]:
# ser_hood = inst_ser[inst_ser < 100]
In [57]:
# ser_hoodod.shape
In [58]:
num_nn = 15
list_cell_density = []
list_nn_dist = []
for inst_col in df['dist'].columns.tolist():
    # skip self distance of zero
    inst_ser = df['dist'][inst_col].sort_values()[1:]
    
    ser_nn = inst_ser[:num_nn]
    
    # find nearest neighbor cell type distribution
    nn_names = ser_nn.index.tolist()
    nn_cats = [x[1].split(': ')[1] for x in nn_names]
    nn_dist = pd.Series(nn_cats).value_counts()/num_nn
    
    # number of cells in 100px neighborhood
    cell_density = inst_ser[inst_ser < 100].shape[0]
    list_cell_density.append(cell_density)

    nn_dist.name = (inst_col[0], inst_col[1], 'Density: ' + str(cell_density))
    list_nn_dist.append(nn_dist)

df['nn-dist'] = pd.concat(list_nn_dist, axis=1, sort=True).fillna(0)    

All Cells

In [59]:
net.load_df(df['nn-dist'])
net.widget()
In [60]:
net.load_df(df['nn-dist'])
net.normalize(axis='row', norm_type='zscore')
net.clip(-5,5)
net.widget()

B Cells

In [61]:
keep_cols = [x for x in df['nn-dist'].columns.tolist() if x[1].split(': ')[1] == 'B cells']
len(keep_cols)
Out[61]:
2826
In [62]:
# net.load_df(df['nn-dist'][keep_cols])
# net.normalize(axis='row', norm_type='zscore')
# net.clip(-5,5)
# net.widget()

Erythroblasts

In [63]:
# keep_cols = [x for x in df['nn-dist'].columns.tolist() if x[1].split(': ')[1] == 'erythroblasts']
# len(keep_cols)
In [64]:
# net.load_df(df['nn-dist'][keep_cols])
# net.normalize(axis='row', norm_type='zscore')
# net.clip(-5,5)
# net.widget()

Merge Expression Levels and Neighborhood Distributions

In [65]:
df['nn-dist'].shape
Out[65]:
(27, 5146)
In [66]:
df['nn-dist'].columns.tolist()[0]
Out[66]:
('C-643', 'Cell Type: B cells', 'Density: 14')
In [67]:
dist_dict = {}
for inst_col in df['nn-dist'].columns.tolist():
    dist_dict[inst_col[0]] = inst_col[2]
In [68]:
df['tile-exp-clean'].shape
Out[68]:
(29, 5018)

Add Cell Density to Expression Levels

In [69]:
df['exp-common'] = deepcopy(df['tile-exp-clean'])
exp_cols = [(x[0], x[1], dist_dict[x[0]]) for x in df['exp-common'].columns]
df['exp-common'].columns = exp_cols
df['exp-common'].index = [(x, 'Type: Expression') for x in df['exp-common'].index]
df['exp-common'].shape
Out[69]:
(29, 5018)

Find Common Cells

In [70]:
df['nn-dist-common'] = df['nn-dist'][exp_cols]
df['nn-dist-common'].index = [(x, 'Type: Neighborhood') for x in df['nn-dist-common'].index]
df['nn-dist-common'].shape
Out[70]:
(27, 5018)
In [71]:
df['merge'] = pd.concat([df['exp-common'], df['nn-dist-common']], axis=0)
df['merge'].shape
Out[71]:
(56, 5018)

Merge Dist and Expression

In [72]:
keep_cols = [x for x in df['merge'].columns.tolist() if x[1].split(': ')[1] == 'B cells']
len(keep_cols)
Out[72]:
2826
In [73]:
net.load_df(df['merge'][keep_cols])
net.normalize(axis='row', norm_type='zscore')
net.clip(-5,5)
net.widget()

'CD4(+) T cells'

In [74]:
keep_cols = [x for x in df['merge'].columns.tolist() if x[1].split(': ')[1] == 'CD4(+) T cells']
len(keep_cols)
Out[74]:
378
In [75]:
net.load_df(df['merge'][keep_cols])
net.filter_threshold(inst_rc='row', num_occur=1, threshold=0.001)
net.normalize(axis='row', norm_type='zscore')
net.clip(-5,5)
net.widget()

'marginal zone mphs'

In [76]:
keep_cols = [x for x in df['merge'].columns.tolist() if x[1].split(': ')[1] == 'marginal zone mphs']
len(keep_cols)
Out[76]:
151
In [77]:
net.load_df(df['merge'][keep_cols])
net.filter_threshold(inst_rc='row', num_occur=1, threshold=0.001)
net.normalize(axis='row', norm_type='zscore')
net.clip(-5,5)
net.widget()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
ser_density = pd.Series(list_cell_density)
ser_density.hist()
In [ ]:
nn_names = ser_nn.index.tolist()
nn_cats = [x[1].split(': ')[1] for x in nn_names]
nn_dist = pd.Series(nn_cats).value_counts()/num_nn
nn_dist
In [ ]:
inst_ser.head()
In [ ]:
inst_ser[1:].head()
In [ ]:
 
In [ ]:
 
In [ ]:
print(cells[0])
df['dist'][cells[0]].sort_values()
In [ ]: